gun_df = read.csv("data/mass_shootings_2018_2024_cleaned.csv") |>
  janitor::clean_names()

Structure and Size of Data

The dataset contains information on gun-related incidents from 2018 to 2024 across different cities and states in the United States. There are a total of 3932 observation and 7 variables.

  • date: Date of the incident (class: Date)
  • city: City where the incident occurred (class: character)
  • state: State where the incident occurred (class: character)
  • dead: Number of fatalities (class: integer)
  • injured: Number of injuries (class: integer)
  • total: Total number of victims (Dead + Injured) (class: character)
  • description: Textual description of incident (class: character)
glimpse(gun_df) 
## Rows: 3,932
## Columns: 7
## $ date        <chr> "2024-11-18", "2024-11-17", "2024-11-17", "2024-11-16", "2…
## $ city        <chr> "Denver", "New Orleans", "Columbus", "Walthall County", "R…
## $ state       <chr> "Colorado", "Louisiana", "Ohio", "Mississippi", "Californi…
## $ dead        <int> 0, 0, 0, 1, 0, 3, 1, 3, 1, 5, 0, 1, 0, 1, 2, 0, 3, 5, 0, 0…
## $ injured     <int> 4, 10, 4, 4, 4, 1, 4, 1, 3, 0, 4, 12, 5, 3, 2, 5, 3, 0, 4,…
## $ total       <int> 4, 10, 4, 5, 4, 4, 5, 4, 4, 5, 4, 13, 5, 4, 4, 5, 6, 5, 4,…
## $ description <chr> "Four men were wounded at a strip mall following a dispute…

Comparing Number of Incidents by Year, Quarter, Month, and Weekday

By Year

First, we will compare the number of incidents by year.

gun_df = gun_df |>
  mutate(date = as.Date(date), 
         year = year(date)) 

gun_plot = gun_df |>
  ggplot(aes(x = as.factor(year), fill = factor(year))) +
  geom_bar() +
  scale_y_continuous(
    labels = scales::comma,
    limits = c(0, NA),  
    expand = expansion(mult = c(0, 0.15))  
  ) +
  geom_label(stat = "count", aes(label = after_stat(count), y = after_stat(count)), vjust = -0.2) +
  labs(
    title = "Number of Gun-Related Incidents by Year",
    x = "Year",
    y = "Number of Incidents"
  ) +
  theme(legend.position = "none")

print(gun_plot)

The graph shows a general upward trend in gun-related incidents from 2018 to 2022. The peak would be at 717 incidents. The year 2020 shows a significant increase, rising from 441 incidents in 2019 to 617 incidents, and the trend continues with 703 incidents in 2021. However, from 2023 to 2024, the number of incidents appear to decline, with 596 incidents in 2023 and 538 incidents in 2024. This suggests that while the number of incidents has increased over the past few years, the decline in recent years might reflect a shift in trends or external factors impacting the data. Further analysis would be needed to understand the variations in detail.

By Quarter

We will now compare the number of incidents by quarter for each year.

gun_df = gun_df |>
  mutate(date = as.Date(date, format = "%Y-%m-%d"),
         year = year(date),
         quarter = quarter(date)
  )

gun1_plot = gun_df |>
  ggplot(aes(x = factor(quarter), fill = factor(year))) +
  geom_bar(position = "dodge") + 
  labs(
    title = "Quarterly Number of Gun-Related Incidents (2014-2024)",
    x = "Quarter",
    y = "Number of Incidents",
    fill = "Year"
  )
print(gun1_plot)

Let’s further break it down by years and quarters.

q1 = gun_df |>
  filter(year != 2013) |>
  select(year, quarter) |>
  group_by(year) |>
  count(quarter) |>
  ggplot(aes(x = as.factor(quarter), y = n, fill = factor(quarter))) + 
  geom_bar(stat = 'identity') + 
  scale_y_continuous(labels = scales::comma) + 
  facet_grid(. ~ year) + 
  labs(x = 'Quarter by Year', y = 'Number of incidents') +
  theme(legend.position = "none")

q2 = gun_df |>
  filter(year != 2013 & quarter == 1) |>
  select(year, quarter) |>
  group_by(year) |>
  count(quarter) |>
  ggplot(aes(x = as.factor(year), y = n, fill = factor(year))) + 
  geom_bar(stat = 'identity') + 
  scale_y_continuous(labels = scales::comma) + 
  labs(x = 'Incidents in Q1 of each year', y = 'Number of incidents') +
  theme(legend.position = "none")

gridExtra::grid.arrange(q1, q2)

The dataset on incidents of gun violence from 2018-2024 show some seasonality. We notice incidents in Q1 and Q4 are generally lower than those in Q2 and Q3. The second graph highlights that Q1 incidents increased steadily from 2018 to 2021, with smaller increases observed between 2021 and 2022, as well as between 2022 and 2023, suggesting a leveling-off trend during this period. In 2024, there is a slight decrease compared to 2023, but it is important to consider that 2024 is not yet complete. This potential plateau in Q1 incidents could signal a stabilization, though the overall trend since 2018 indicates a significant increase that still warrants attention.

By Month

We will now compare the number of incidents by months.

gun_df$month <- month(gun_df$date, label = TRUE)

gun2_plot = plotly::ggplotly(
  gun_df |>
    filter(year != c(2013, 2018)) |>
    count(month) |>
    ggplot(aes(x = month, y = n, fill = month)) + 
    geom_bar(stat = 'identity') + 
    scale_y_continuous(labels = scales::comma) + 
    labs(x = 'Month', y = 'Number of incidents', title = 'Cumulative Incidents by Month') + 
    theme(legend.position = "none")
) 

gun2_plot

The graph shows the number of incidents by month. Again, we see a clear seasonal trend. Incidents steadily rise from January through July, peaking in July (likely due to heightened activity during the summer months). A sharp decline follows from August to December, with November and December showing the lowest numbers. This could be influenced by colder weather, holiday periods, or shorter months (i.e., February). The visible trends suggest a relationship between weather (seasonal variations), social behavior, and incident rates. You can hover over the bars for exact values to compare these trends in more detail.

By Date

Let’s see the dates with the most incidents.

gun_df$day = day(gun_df$date)  
gun_df = gun_df |> mutate(date2 = paste(month, day)) 

knitr::kable(
  gun_df |> 
    filter(year != c(2013, 2018)) |> 
    count(date2) |> 
    top_n(10) |> 
    arrange(desc(n)) |> 
    rename(date = date2, "total number of incidents" = n)
)
date total number of incidents
Jul 5 49
Jul 4 48
Jan 1 31
Jun 11 26
Jul 28 23
Jun 17 23
Jun 20 22
Jun 23 22
Jul 23 21
May 18 21

The table highlights the dates with the highest total number of incidents, aggregated over 2018-2024. July 5th and July 4th stand out with 49 and 48 incidents respectively, indicating that Independence Day celebrations significantly contribute to increased incidents, likely due to festivities and the use of fireworks or firearms. January 1st, with 31 incidents, suggests that New Year’s Day also brings heightened risk, potentially linked to post-midnight celebrations and gatherings. The remaining dates predominantly occur in late June and July, with totals ranging from 21 to 26 incidents, reflecting a seasonal peak in incidents during summer months. May 18th, while slightly earlier, still aligns closely with this summer trend, reinforcing the pattern of increased incidents during warmer months. These results show the influence of holidays and seasonal trends on incident rates.

By Weekday

We will now compare the number of incidents by weekday.

gun_df$weekday = wday(gun_df$date, label = TRUE)

gun_df |> 
  count(weekday) |> 
  ggplot(aes(x = weekday, y = n, fill = weekday)) + 
  geom_bar(stat = 'identity') + 
  scale_y_continuous(labels = scales::comma) + 
  labs(x = 'Weekday', y = 'Number of incidents', title = 'Cumulative Incidents by Weekday') +
  theme(legend.position = "none")

Sunday stands out as the most dangerous day, with incidents far exceeding 900. Saturdays follow closely with around 900 incidents. Fridays show a moderate increase with about 450 incidents, marking the start of the weekend.Meanwhile, weekdays see significantly lower numbers, with Tuesday recording the fewest incidents at around 200, followed by Thursday at about 350, Wednesday at 375, and Monday at 440. This trend show the influence of weekends or leisure culture on the frequency of incidents, with weekdays being relatively safer perhaps due to structured routines and fewer late-night activities.

Incidents by State

plotly::ggplotly(gun_df %>% 
                   count(state) %>%
        ggplot(aes(x=reorder(state, n), y=n, fill=n, text=state, fill = state)) +
        geom_bar(stat='identity') +
        theme(axis.text.x = element_text(angle = 80, hjust = 1)) +
        labs(x='', y='Number of incidents', title = 'Cumulative Incidents by State'),
        tooltip=c("text", "y"))

The interactive bar graph above shows absolute number of gun violence by states. Hovering over the bars shows a label with the absolute number of incidents for a state. Illinois has the highest number of gun violence cases followed by Texas and California.

Incidents Relative to State Population

statesPop <- read_csv("data/PopulationUS.csv")

statesPop <- statesPop %>% 
  select(NAME, POPESTIMATE2017)

statesPop <- statesPop %>% 
  filter(!NAME %in% c("United States", 
                      "Puerto Rico Commonwealth"))

statesPop <- statesPop %>% 
  rename(state=NAME)

statesPop$state <- as.factor(statesPop$state)
incidentsByState <- gun_df %>% 
  group_by(state) %>%
  count(state)

incidentsByState <- left_join(incidentsByState, 
                             statesPop, 
                             by="state")

incidentsByState <- incidentsByState %>%  
  filter(state != "Washington, D.C.") 

incidentsByState$Per100000 <- ((incidentsByState$n/incidentsByState$POPESTIMATE2017)*100000)

kable(head(incidentsByState))
state n POPESTIMATE2017 Per100000
Alabama 115 4874747 2.3590968
Alaska 6 739795 0.8110355
Arizona 46 7016270 0.6556190
Arkansas 49 3004279 1.6310070
California 297 39536653 0.7512017
Colorado 71 5607154 1.2662395

This table provides the numerical values for the number of incidents, 2017 population, and the scaled number of incidents per 100,000 people.

plot <- ggplot(incidentsByState, aes(x = reorder(state, Per100000), y = Per100000, fill = Per100000)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = '', y = 'Incidents per 100,000 inhabitants', title = 'Cumulative Incidents by State Relative to Population') +
  theme_minimal() +
  theme(legend.position = "none")

ggplotly(plot, tooltip = c("y", "x"))

This is the Number of Incidents scaled to state population. The order changes quite drastically where the District of Columbia, Louisiana, and Mississippi now lead and Idaho, Wyoming, and New Hampshire now have the lowest though the bottom is much more concentrated.

Interactive Map of Incidents by State

states <- st_read(dsn = "data/cb_2017_us_state_500k/cb_2017_us_state_500k.shp", quiet = FALSE, stringsAsFactors = TRUE)

# class(states)

kable(head(st_drop_geometry(states)))
addPer100k <- data.frame(id=states$GEOID, 
                         name=states$NAME)

addPer100k <- left_join(addPer100k, incidentsByState %>% 
                          select(state, Per100000) %>% 
                          rename(name=state), by="name")

addPer100k$Per100000[is.na(addPer100k$Per100000)] <- 0

states$per100k <- addPer100k$Per100000
bins <- c(0, 0.5, 1.0, 1.5, 2.0, Inf)

pal <- colorBin("YlOrRd", 
                domain = states$per100k, 
                bins = bins)

state_popup <- paste0("<strong>State: </strong>", 
                      states$NAME, 
                      "<br><strong>Incidents per 100,000 inhabitants </strong>", 
                      states$per100k) %>% 
  
  lapply(htmltools::HTML)

leaflet(data = states) %>%
  
        setView(lng=-96, 
                lat=37.8, 
                zoom=4) %>%
  
        addProviderTiles("MapBox", options = providerTileOptions(id = "mapbox.light",
                accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
  
        addPolygons(
                fillColor = ~pal(per100k),
                weight = 2,
                opacity = 1,
                color = "white",
                dashArray = "3",
                fillOpacity = 0.7,
                highlight = highlightOptions(
                        weight = 5,
                        color = "#666",
                        dashArray = "",
                        fillOpacity = 0.7,
                        bringToFront = TRUE),
                label = state_popup,
                labelOptions = labelOptions(
                        style = list("font-weight" = "normal", 
                                     padding = "3px 8px"),
                        textsize = "15px",
                        direction = "auto")) %>%
  
        addLegend(pal = pal, 
                  values = ~per100k, 
                  opacity = 0.7, 
                  title = "Incidents", 
                  position = "bottomright")

Here an interactive map which displays the same data. It is scaled via a discrete legend so the heat map is slightly different than viridis but it conveys the same data in a more engaging format.

Victims by State

VictimsByState <- gun_df %>% 
  group_by(state) %>% 
  summarize(total_victims=sum(total), 
            total_injured=sum(injured), 
            total_dead=sum(dead), 
            percent_dead=round(total_dead/total_victims,2), 
            total_incidence=n(), 
            percent_victims_per_incidents=round(total_victims/total_incidence,2))

VictimsByState_table = VictimsByState |>
  head(10) |>
  knitr::kable()

VictimsByState_table
state total_victims total_injured total_dead percent_dead total_incidence percent_victims_per_incidents
Alabama 647 516 131 0.20 115 5.63
Alaska 26 14 12 0.46 6 4.33
Arizona 259 193 67 0.26 46 5.63
Arkansas 265 208 57 0.22 49 5.41
California 1564 1174 390 0.25 297 5.27
Colorado 382 278 104 0.27 71 5.38
Connecticut 102 89 13 0.13 24 4.25
Delaware 74 60 14 0.19 17 4.35
District of Columbia 270 233 37 0.14 57 4.74
Florida 1121 855 266 0.24 211 5.31

This table provides the number of victims by state and the number of victims per incident by state. This shows how many people were involved in the average incident by state.

VictimsByState %>% ggplot(aes(x = reorder(state, 
                         -percent_victims_per_incidents), 
             y = percent_victims_per_incidents, 
             fill = percent_victims_per_incidents)) + 
  
  geom_bar(stat = 'identity') +
  
  labs(x = '', 
       y = 'Victims per incidents',
       fill = 'Percent of Total Victims per Incident', 
       title = 'Cumulative Percent of Total Victims per Incident by State') +
  
  theme(axis.text.x = element_text(angle = 80, 
                                   hjust = 1))

The bar graph above shows the number of victims per incident by state. Maine has the highest victims by incidents followed by Washington, DC followed by Puerto Rico.

VictimsByState <- left_join(VictimsByState, 
                           statesPop, 
                           by="state")

VictimsByState$Per100000 <- round((VictimsByState$total_victims/VictimsByState$POPESTIMATE2017)*100000)


plotly::ggplotly(VictimsByState %>% 
                   
                   filter(state!="District of Columbia") %>%
                   
        ggplot(aes(x=reorder(state, Per100000), y=Per100000, fill=Per100000, text=state)) +
          
        geom_bar(stat='identity') + 
        
        coord_flip() +
          
        labs(x = '', 
             y = 'Victims per 100,000 inhabitants',
             fill = 'Total Victims per Incident', 
             title = 'Cumulative Total Victims per Incident Relative to State Population') + 
          
        theme(legend.position="none"),
        tooltip=c("text", "y"))

This shows the average number of victims per incident when we scale by state. There is another dramatic reordering with Louisiana, Mississippi, and Illinois now taking the top spot. There is no data for DC.

Interactive Map of Victims by State

addPer100k <- addPer100k %>% select(id, name)

addPer100k <- left_join(addPer100k, VictimsByState %>% 
                          select(state, Per100000) %>% 
                          rename(name=state), 
                        by="name")

addPer100k$Per100000[is.na(addPer100k$Per100000)] <- 0

states$per100k <- addPer100k$Per100000
bins <- c(0, 5, 10, 15, 20, Inf)
pal <- colorBin("YlOrRd", domain = states$per100k, bins = bins)

state_popup <- paste0("<strong>State: </strong>", 
                      states$NAME, 
                      "<br><strong>Victims per 100,000 inhabitants </strong>", 
                      states$per100k) %>% lapply(htmltools::HTML)

leaflet(data = states) %>%
  
        setView(lng=-96, 
                lat=37.8, 
                zoom=4) %>%
  
        addProviderTiles("MapBox", 
                         options = providerTileOptions(id = "mapbox.light",
                accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
  
        addPolygons(
                fillColor = ~pal(per100k),
                weight = 2,
                opacity = 1,
                color = "white",
                dashArray = "3",
                fillOpacity = 0.7,
                highlight = highlightOptions(
                        weight = 5,
                        color = "#666",
                        dashArray = "",
                        fillOpacity = 0.7,
                        bringToFront = TRUE),
                label = state_popup,
                labelOptions = labelOptions(
                        style = list("font-weight" = "normal", padding = "3px 8px"),
                        textsize = "15px",
                        direction = "auto")) %>%
  
        addLegend(pal = pal, 
                  values = ~per100k, 
                  opacity = 0.7, 
                  title = "Victims Per Incident", 
                  position = "bottomright")

Once again this is an interative map which helps visalize and reinforce the previous information.

Incidents by City

Here are the top number of incidents by city.

incidentsByCity <- gun_df %>% 
  select(city, state) %>% 
  group_by(city, state) %>% 
  summarize(cityIncidents=n())

incidentsByCity_table = incidentsByCity |>
  head(10) |>
  knitr::kable()

incidentsByCity_table
city state cityIncidents
Abbeville South Carolina 1
Aberdeen Maryland 1
Abington Massachusetts 1
Adelanto California 1
Adelphi Maryland 1
Aguanga California 1
Aiken South Carolina 2
Akron Ohio 6
Alachua Florida 1
Alameda California 1

Here are the top number of incidents per city within New York State.

incidents_nystate <- incidentsByCity[incidentsByCity$state == 'New York', ] 

incidents_nystate_table = incidents_nystate |>
  head(10) |>
  knitr::kable()

incidents_nystate_table
city state cityIncidents
Albany New York 10
Bay Shore New York 2
Buffalo New York 17
Copiague New York 1
Freeport New York 1
Hempstead New York 2
Lockport New York 1
Mount Vernon New York 1
New City New York 1
New Rochelle New York 1

Here are the number of incidents per New York City.

incidents_nyc <- incidentsByCity[(incidentsByCity$city %in% c('New York City')) & 
                                   incidentsByCity$state=='New York',]

incidents_nyc_table = incidents_nyc |>
  knitr::kable()

For a city of 8 million people this seems quite low. This suggests that crime has fallen dramatically, crime is under-reported, or that the data collection mechanism is selective.

threshold <- incidentsByCity %>%
  pull(cityIncidents) %>%
  sort(decreasing = TRUE) %>%
  nth(50) 

top_cities <- incidentsByCity %>%
  filter(cityIncidents >= threshold) %>% 
  
  arrange(desc(cityIncidents))  

top_cities %>%
  ggplot(aes(x = reorder(city, cityIncidents), 
             y = cityIncidents, 
             fill = cityIncidents)) + 
    geom_bar(stat = 'identity') +
    labs(x = '', 
         y = 'Number of Incidents', 
         title = 'Top 50 Cities by Number of Incidents',
         fill = 'Number of Incidents') +
    coord_flip()

The bar graph above shows the number of incidents by city. Chicago has the highest number of incidents followed by Philadelphia followed by New York City.

Merging Location Data for Incident Mapping

We are merging a new dataset obtained from Simple Maps US Cities to our gun_df dataset to add the variables latitude (lat) and longitude (lng) of all included US cities for visualizations.

uscities <- read.csv("data/simplemaps_uscities/uscities.csv") |>
  janitor::clean_names()

gun_df <- gun_df |>
  left_join(
    uscities |> select(city, state_name, lat, lng),
    by = c("city" = "city", "state" = "state_name")
  )

write.csv(gun_df, "gun_df.csv")

Top Incidents by Victim Count

top_incidents <- gun_df |> 
  select(date, city, state, dead, injured, total, lat, lng) |> 
  arrange(desc(total)) |> 
  slice(1:13) 

top_incidents_table = top_incidents |>
  knitr::kable()

top_incidents_table
date city state dead injured total lat lng
2022-07-04 Highland Park Illinois 7 48 55 42.1823 -87.8104
2019-08-03 El Paso Texas 23 22 45 31.8476 -106.4300
2022-05-24 Uvalde Texas 22 18 40 29.2152 -99.7782
2019-08-04 Dayton Ohio 10 27 37 39.7805 -84.2003
2023-04-15 Dadeville Alabama 4 32 36 32.8326 -85.7675
2018-02-14 Parkland Florida 17 17 34 26.3219 -80.2533
2023-10-25 Lewiston Maine 19 13 32 44.0915 -70.1681
2024-06-02 Akron Ohio 1 29 30 41.0798 -81.5219
2023-07-02 Baltimore Maryland 2 28 30 39.3051 -76.6144
2018-11-07 Thousand Oaks California 13 16 29 34.1914 -118.8756
2022-03-19 Dumas Arkansas 1 26 27 33.8834 -91.4857
2022-11-19 Colorado Springs Colorado 5 19 24 38.8674 -104.7605
2018-05-18 Santa Fe Texas 10 14 24 29.3889 -95.1003

The above shows the 13 most severe gun-related incidents in the dataset, ranked by the total number of victims (dead and injured). The incidents range from mass shootings at public events to tragic attacks in schools and other locations. Each entry provides details on the location, date, fatalities, injuries, and a brief description of the event, offering insights into the magnitude and context of these tragedies.

Interactive Map of Top Incidents by Victim Count

labels <- paste0(
  "<strong>Date:</strong> ", top_incidents$Date, "<br>",
  "<strong>City:</strong> ", top_incidents$city, "<br>",
  "<strong>State:</strong> ", top_incidents$state, "<br>",
  "<strong>Total Victims:</strong> ", top_incidents$Total, "<br>",
  "<strong>Description:</strong> ", top_incidents$Description
) |> lapply(htmltools::HTML)

leaflet(top_incidents) |>
  setView(lng=-96, lat=37.8, zoom=4)|>
  addTiles() |> 
  addProviderTiles("CartoDB.Positron") |>
  addCircleMarkers( ~lng, ~lat, color = "red", radius = ~sqrt(total), 
    label = labels, 
    fillOpacity = 0.7
  )